load('/data/nycdoe/clean_data/school_percentile_for_thoa.Rdata')
summary(school_percentile)
##      dbn              percentile             rank       
##  Length:2036        Min.   :  0.09934   Min.   :   1.0  
##  Class :character   1st Qu.: 27.32366   1st Qu.: 509.8  
##  Mode  :character   Median : 48.87417   Median :1018.5  
##                     Mean   : 49.00269   Mean   :1018.5  
##                     3rd Qu.: 70.77172   3rd Qu.:1527.2  
##                     Max.   :100.00000   Max.   :2036.0
# bigger value for `rank` means worse school (percentile is lower). Highest percentile - 100 - is ranked number 1.
school_percentile %>%
  filter(rank == 1)
## # A tibble: 1 x 3
##      dbn percentile  rank
##    <chr>      <dbl> <dbl>
## 1 75M444        100     1
load('/data/nycdoe/clean_data/avg_all_long.Rdata')

How good are the students who apply to a school?

Note: we only care about students who put the school on their First choice in First round.

load("/data/nycdoe/clean_data/eightgrade_apps_long_w_dbn.Rdata")
summary(eightgrade_apps_long_w_dbn)
##       year      student_id_scram    feeder_dbn         grade_level
##  Min.   :2005   Length:9421536     Length:9421536     Min.   :8   
##  1st Qu.:2007   Class :character   Class :character   1st Qu.:8   
##  Median :2010   Mode  :character   Mode  :character   Median :8   
##  Mean   :2010                                         Mean   :8   
##  3rd Qu.:2013                                         3rd Qu.:8   
##  Max.   :2015                                         Max.   :8   
##  feeder_dbn(june)      choice          program_applied   
##  Length:9421536     Length:9421536     Length:9421536    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      dbn               pgname          school_name       
##  Length:9421536     Length:9421536     Length:9421536    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##    interest            method         
##  Length:9421536     Length:9421536    
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
head(eightgrade_apps_long_w_dbn)
## # A tibble: 6 x 12
##    year student_id_scram feeder_dbn grade_level `feeder_dbn(june)`
##   <dbl>            <chr>      <chr>       <int>              <chr>
## 1  2005        489302451     10X206           8             10X206
## 2  2005        626402037     10X206           8             10X206
## 3  2005        843402069     10X206           8             10X206
## 4  2005        967502441     10X206           8             10X206
## 5  2005        330502342     10X206           8             10X206
## 6  2005        799502792     10X206           8             10X206
## # ... with 7 more variables: choice <chr>, program_applied <chr>,
## #   dbn <chr>, pgname <chr>, school_name <chr>, interest <chr>,
## #   method <chr>
# so this table has from r1programcode1 up to r1programcode12 only
tail(eightgrade_apps_long_w_dbn$choice)
## [1] "r1programcode12" "r1programcode12" "r1programcode12" "r1programcode12"
## [5] "r1programcode12" "r1programcode12"
# 104,378 rows (1.108%) missing school names. These come from 31 dbns (and programs) with "Missing" school names.
eightgrade_apps_long_w_dbn %>%
  filter(school_name == "Missing") %>% distinct(dbn, program_applied)
## # A tibble: 31 x 2
##    program_applied    dbn
##              <chr>  <chr>
##  1            X74A 12X690
##  2            X67J 07X655
##  3            X40A 11X541
##  4            M56A 02M620
##  5            X82X 12X262
##  6            X60O 10X660
##  7            X54A 09X414
##  8            M24F 02M460
##  9            M72A 05M469
## 10            X12Z 11X415
## # ... with 21 more rows
# join the student's performance to the application data 
rm(perschool_appquality)
## Warning in rm(perschool_appquality): object 'perschool_appquality' not
## found
perschool_appquality <-
  eightgrade_apps_long_w_dbn %>%
  filter(choice == "r1programcode1") %>% #because we only care about what students put in r1r1
  group_by(year, dbn) %>%
  select(year, dbn, school_name, program_applied, pgname, student_id_scram) %>%
  inner_join(avg_all_percentile, by=c("student_id_scram", "year")) %>%
  arrange(year, dbn, performance)

# thanks to `inner_join`, there is no row with missing student's performance
summary(perschool_appquality)
##       year          dbn            school_name        program_applied   
##  Min.   :2005   Length:760199      Length:760199      Length:760199     
##  1st Qu.:2007   Class :character   Class :character   Class :character  
##  Median :2010   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2010                                                           
##  3rd Qu.:2013                                                           
##  Max.   :2015                                                           
##     pgname          student_id_scram    grade_level  performance   
##  Length:760199      Length:760199      Min.   :8    Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:8    1st Qu.:26.06  
##  Mode  :character   Mode  :character   Median :8    Median :45.20  
##                                        Mean   :8    Mean   :45.66  
##                                        3rd Qu.:8    3rd Qu.:64.76  
##                                        Max.   :8    Max.   :99.92
perschool_appquality_medianPerf <-
  perschool_appquality %>%
  group_by(year, dbn) %>%
  summarize(median_stud_perf = median(performance),
            mean_stud_perf = mean(performance),
            top5_stud_perf = quantile(performance, 0.95)) 
summary(perschool_appquality_medianPerf)
##       year          dbn            median_stud_perf mean_stud_perf  
##  Min.   :2005   Length:4188        Min.   : 1.977   Min.   : 2.162  
##  1st Qu.:2008   Class :character   1st Qu.:26.780   1st Qu.:29.441  
##  Median :2010   Mode  :character   Median :33.508   Median :35.222  
##  Mean   :2010                      Mean   :35.170   Mean   :36.685  
##  3rd Qu.:2013                      3rd Qu.:41.522   3rd Qu.:42.078  
##  Max.   :2015                      Max.   :82.871   Max.   :82.871  
##  top5_stud_perf  
##  Min.   : 2.162  
##  1st Qu.:61.963  
##  Median :70.003  
##  Mean   :69.498  
##  3rd Qu.:77.909  
##  Max.   :96.923

Most schools receive applicants with performance in 20 - 40 (out of 100) range.

# plot it!
yr = 2015

(plot1 <-
  perschool_appquality_medianPerf %>%
  filter(year == yr) %>% 
  ggplot() +
  geom_histogram(aes(x = median_stud_perf), bins = 80) +
  labs(title = paste("Distribution of Applicant Quality among Schools in", yr),
       x = "Median applicant's performance",
       y = "Number of schools")) 

ggsave(filename = "app_quality_2015.pdf", plot = plot1, width = 8, height = 6)

# join in percentile of the schools themselves
rm(temp)
## Warning in rm(temp): object 'temp' not found
temp <- perschool_appquality_medianPerf %>%
  inner_join(school_percentile, by="dbn") %>%
  rename("percentile_school" = percentile)

Across all years, there are 2,792 schools with info on applicants’ performance.

summary(temp)
##       year          dbn            median_stud_perf mean_stud_perf  
##  Min.   :2005   Length:4132        Min.   : 1.977   Min.   : 2.162  
##  1st Qu.:2008   Class :character   1st Qu.:26.765   1st Qu.:29.429  
##  Median :2010   Mode  :character   Median :33.529   Median :35.220  
##  Mean   :2010                      Mean   :35.198   Mean   :36.701  
##  3rd Qu.:2013                      3rd Qu.:41.623   3rd Qu.:42.190  
##  Max.   :2015                      Max.   :82.536   Max.   :78.937  
##  top5_stud_perf   percentile_school      rank       
##  Min.   : 2.162   Min.   : 1.258    Min.   :   7.0  
##  1st Qu.:61.894   1st Qu.:32.900    1st Qu.: 556.0  
##  Median :69.932   Median :49.042    Median :1015.0  
##  Mean   :69.474   Mean   :50.904    Mean   : 981.9  
##  3rd Qu.:77.909   3rd Qu.:68.228    3rd Qu.:1404.0  
##  Max.   :96.923   Max.   :98.871    Max.   :2019.0

The number of schools applied to steadily increases over years. From 300 in 2015 to more than 400 in 2015. (but note, the actual number could be higher, because these are just schools with available applicant’s performance.)

(plot0 <- 
  temp %>% 
  count(year) %>% 
  ggplot(aes(x = as.factor(year), y = n)) + 
  geom_point() +
  geom_text(aes(label = n), vjust = -0.7) + 
  labs(title = "Number of High Schools Applied to over Years", x = "Year", y = "Number of High Schools",
       subtitle = "Note: the actual number could be higher; these are just schools w/ available applicants' performance"))

ggsave(filename = "count_schools_applied_over_years.pdf", plot = plot0, width = 8, height = 6)

Better schools generally receive better applicants

# plot student's performance against school's percentile (note: there are 418 points in this plot)
(plot2 <-
  temp %>%  
  filter(year == yr) %>%
  ggplot(aes(y = median_stud_perf, x = percentile_school)) +
  geom_point(size = 2) +
  labs(title = paste("School percentile VS. Applicant quality in", yr), 
       x = "School percentile (better schools on the right)",
       y = "Typical applicant quality (higher quality on top)") +
  geom_smooth())
## `geom_smooth()` using method = 'loess'

ggsave(filename = "school_pctile_vs_applicant_quality_2015.pdf", plot = plot2, width = 8, height = 6)
## `geom_smooth()` using method = 'loess'

How ambitious are students’ applications?

1. Distribution of ambitiousness among all applicants

# load("/data/nycdoe/clean_data/eightgrade_apps_long_w_dbn.Rdata")
# summary(eightgrade_apps_long_w_dbn)

rm(eightgrade_apps_long_w_dbn_percentile)
## Warning in rm(eightgrade_apps_long_w_dbn_percentile): object
## 'eightgrade_apps_long_w_dbn_percentile' not found
eightgrade_apps_long_w_dbn_percentile <-
  eightgrade_apps_long_w_dbn %>%
  left_join(school_percentile, by="dbn") %>%
  select(year, student_id_scram, choice, program_applied, dbn, "percentile_school" = percentile)
head(eightgrade_apps_long_w_dbn_percentile)
## # A tibble: 6 x 6
##    year student_id_scram         choice program_applied    dbn
##   <dbl>            <chr>          <chr>           <chr>  <chr>
## 1  2005        489302451 r1programcode1            X31A 07X548
## 2  2005        626402037 r1programcode1            X86R 10X268
## 3  2005        843402069 r1programcode1            X39A 10X549
## 4  2005        967502441 r1programcode1            X36A 09X403
## 5  2005        330502342 r1programcode1            X86R 10X268
## 6  2005        799502792 r1programcode1            M57B 02M542
## # ... with 1 more variables: percentile_school <dbl>
# among ~9.4 million applications, ~4.6 million don't have percentile_school
summary(eightgrade_apps_long_w_dbn_percentile)  
##       year      student_id_scram      choice          program_applied   
##  Min.   :2005   Length:9421536     Length:9421536     Length:9421536    
##  1st Qu.:2007   Class :character   Class :character   Class :character  
##  Median :2010   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2010                                                           
##  3rd Qu.:2013                                                           
##  Max.   :2015                                                           
##                                                                         
##      dbn            percentile_school
##  Length:9421536     Min.   : 1       
##  Class :character   1st Qu.:44       
##  Mode  :character   Median :67       
##                     Mean   :63       
##                     3rd Qu.:81       
##                     Max.   :99       
##                     NA's   :4617901
# # sanity check: anyone not list choice1 but list choice2, etc.? YES! There are a good number of people who do this. 
# eightgrade_apps_long_w_dbn %>%
#   mutate(foo = is.na(program_applied)) %>%
#   #select(year, student_id_scram, choice, program_applied, foo) %>%
#   filter(year == 2005) %>%
#   arrange(student_id_scram) %>%
#   group_by(student_id_scram) %>%
#   summarize(bar = length(rle(foo)),
#             baz = sum(lag(foo, default=0) != foo)) %>%
#   filter(bar > 2 | baz > 1)
# 
# # Because there are so many cases like this, we now only care that applicants MUST NOT SKIP TOP1 CHOICE if they ever list any choice at all.
# eightgrade_apps_long_w_dbn_notskip_r1r1 <-
#   eightgrade_apps_long_w_dbn %>%
#   group_by(year, student_id_scram) %>%
#   mutate(foo = is.na(program_applied)) %>%
#   #select(year, student_id_scram, choice, program_applied, foo) %>%
#   filter(year == 2005) %>%
#   arrange(student_id_scram) %>%
#   group_by(student_id_scram)
# who_skip_r1r1 <-
#   eightgrade_apps_long_w_dbn %>%
#   filter(choice == "r1pgrogramcode1" & is.na(program_applied)) %>%
#   left_join(eightgrade_apps_long_w_dbn, by=c("year", "student_id_scram"))
# 
# eightgrade_apps_long_w_dbn[which(eightgrade_apps_long_w_dbn$choice == "r1programcode1")]
#     
# eightgrade_apps_long_w_dbn %>%
#   filter(student_id_scram == "102112789")

# compute "application score"
perapp_aspiration <-
  eightgrade_apps_long_w_dbn_percentile %>%
  group_by(year, student_id_scram) %>%
  summarize(schoolpercentile_of_topchoice = percentile_school[1],
            median_schoolpercentile = median(percentile_school, na.rm=TRUE)) 
head(perapp_aspiration)
## # A tibble: 6 x 4
## # Groups:   year [1]
##    year student_id_scram schoolpercentile_of_topchoice
##   <dbl>            <chr>                         <dbl>
## 1  2005        100162233                      42.68126
## 2  2005        100202069                      72.81122
## 3  2005        100262015                      66.64103
## 4  2005        100262063                            NA
## 5  2005        100262096                      35.29412
## 6  2005        100262143                      86.73051
## # ... with 1 more variables: median_schoolpercentile <dbl>
summary(perapp_aspiration)
##       year      student_id_scram   schoolpercentile_of_topchoice
##  Min.   :2005   Length:785128      Min.   : 1.26                
##  1st Qu.:2007   Class :character   1st Qu.:50.07                
##  Median :2010   Mode  :character   Median :70.45                
##  Mean   :2010                      Mean   :67.23                
##  3rd Qu.:2013                      3rd Qu.:84.71                
##  Max.   :2015                      Max.   :98.87                
##                                    NA's   :84011                
##  median_schoolpercentile
##  Min.   : 1.26          
##  1st Qu.:49.59          
##  Median :67.51          
##  Mean   :63.98          
##  3rd Qu.:77.96          
##  Max.   :99.04          
##  NA's   :36597
# missing_topchoice <-
#   eightgrade_apps_long_w_dbn_percentile %>%
#   filter(year == 2010) %>%
#   group_by(student_id_scram) %>%
#   summarize(schoolpercentile_of_topchoice = percentile_school[1]) %>%
#   filter(is.na(schoolpercentile_of_topchoice))
# 
# eightgrade_apps_long_w_dbn_percentile %>%
#   filter(year == 2010 & student_id_scram %in% missing_topchoice$student_id_scram & choice == "r1programcode1") %>%
#   count(is.na(program_applied))
# # 

# 47,414 rows like this, so it seems missing topchoice-percentile does NOT cover missing median-percentile
perapp_aspiration %>%
  filter(is.na(schoolpercentile_of_topchoice) & !is.na(median_schoolpercentile))
## # A tibble: 47,414 x 4
## # Groups:   year [11]
##     year student_id_scram schoolpercentile_of_topchoice
##    <dbl>            <chr>                         <dbl>
##  1  2005        100262063                            NA
##  2  2005        100362307                            NA
##  3  2005        100462043                            NA
##  4  2005        100462514                            NA
##  5  2005        100462613                            NA
##  6  2005        100462905                            NA
##  7  2005        100472674                            NA
##  8  2005        100652081                            NA
##  9  2005        100762013                            NA
## 10  2005        100762089                            NA
## # ... with 47,404 more rows, and 1 more variables:
## #   median_schoolpercentile <dbl>
# for now, we will just drop all NA, no matter which column it is in (~84k rows are dropped)
perapp_aspiration_dropNA <-
  perapp_aspiration %>%
  filter(!is.na(schoolpercentile_of_topchoice) & !is.na(median_schoolpercentile))
summary(perapp_aspiration_dropNA)
##       year      student_id_scram   schoolpercentile_of_topchoice
##  Min.   :2005   Length:701117      Min.   : 1.258               
##  1st Qu.:2008   Class :character   1st Qu.:50.068               
##  Median :2010   Mode  :character   Median :70.451               
##  Mean   :2010                      Mean   :67.226               
##  3rd Qu.:2013                      3rd Qu.:84.713               
##  Max.   :2015                      Max.   :98.871               
##  median_schoolpercentile
##  Min.   : 1.258         
##  1st Qu.:50.376         
##  Median :67.819         
##  Mean   :64.584         
##  3rd Qu.:78.762         
##  Max.   :98.871
# plot it:
yr = 2015

# method 1: "application score" = schoolpercentile_of_topchoice
(plot3 <- 
  perapp_aspiration_dropNA %>%
  filter(year == yr) %>%
  ggplot() +
  geom_histogram(aes(x = schoolpercentile_of_topchoice), bins = 40) +
  labs(title = paste("Distribution of Aspiration in", yr),
       subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
       x = "Application's aspiration (based on the percentile of their top1 school)",
       y = "Number of students at this level") +
  scale_y_continuous(labels = scales::comma))

ggsave(filename = "ambitious_bytop1school_2015.pdf", plot = plot3, width = 8, height = 6)

# method 2: "application score" = median_schoolpercentile
(plot4 <-
  perapp_aspiration_dropNA %>%
  filter(year == yr) %>%
  ggplot() +
  geom_histogram(aes(x = median_schoolpercentile), bins = 40) +
  labs(title = paste("Distribution of Aspiration in", yr),
       subtitle = "Method 2: 'application score' = median_schoolpercentile",
       x = "Application's aspiration (based on the median percentile of all schools on list)",
       y = "Number of students at this level") +
  scale_y_continuous(labels = scales::comma))

ggsave(filename = "ambitious_bymedian_2015.pdf", plot = plot4, width = 8, height = 6)

2. Students’ aspirations vs. their performance

rm(aspiration_vs_perf)
## Warning in rm(aspiration_vs_perf): object 'aspiration_vs_perf' not found
aspiration_vs_perf <-
  perapp_aspiration_dropNA %>%
  left_join(avg_all_percentile, by=c("student_id_scram", "year"))
summary(aspiration_vs_perf)
##       year      student_id_scram   schoolpercentile_of_topchoice
##  Min.   :2005   Length:701117      Min.   : 1.258               
##  1st Qu.:2008   Class :character   1st Qu.:50.068               
##  Median :2010   Mode  :character   Median :70.451               
##  Mean   :2010                      Mean   :67.226               
##  3rd Qu.:2013                      3rd Qu.:84.713               
##  Max.   :2015                      Max.   :98.871               
##                                                                 
##  median_schoolpercentile  grade_level     performance   
##  Min.   : 1.258          Min.   :8       Min.   : 0.00  
##  1st Qu.:50.376          1st Qu.:8       1st Qu.:27.16  
##  Median :67.819          Median :8       Median :46.34  
##  Mean   :64.584          Mean   :8       Mean   :46.60  
##  3rd Qu.:78.762          3rd Qu.:8       3rd Qu.:65.71  
##  Max.   :98.871          Max.   :8       Max.   :99.92  
##                          NA's   :12576   NA's   :12576
head(aspiration_vs_perf)
## # A tibble: 6 x 6
## # Groups:   year [1]
##    year student_id_scram schoolpercentile_of_topchoice
##   <dbl>            <chr>                         <dbl>
## 1  2005        100162233                      42.68126
## 2  2005        100202069                      72.81122
## 3  2005        100262015                      66.64103
## 4  2005        100262096                      35.29412
## 5  2005        100262143                      86.73051
## 6  2005        100262355                      77.94118
## # ... with 3 more variables: median_schoolpercentile <dbl>,
## #   grade_level <int>, performance <dbl>
tail(aspiration_vs_perf)
## # A tibble: 6 x 6
## # Groups:   year [1]
##    year student_id_scram schoolpercentile_of_topchoice
##   <dbl>            <chr>                         <dbl>
## 1  2015        999902058                      69.42544
## 2  2015        999902280                      85.67031
## 3  2015        999902628                      55.09576
## 4  2015        999902751                      78.86457
## 5  2015        999912646                      96.71683
## 6  2015        999912769                      93.67305
## # ... with 3 more variables: median_schoolpercentile <dbl>,
## #   grade_level <int>, performance <dbl>
# there are 12,576 students (1.79%) with NA in performance. We will drop these.
aspiration_vs_perf <-
  aspiration_vs_perf %>%
  filter(!is.na(performance))


# plot it!
yr = 2015
# method 1: "application score" = schoolpercentile_of_topchoice
aspiration_vs_perf %>%
  filter(year == yr) %>%
  ggplot() +
  geom_point(aes(x = performance, y = schoolpercentile_of_topchoice), alpha = 0.2, color = "dark blue") +
  labs(title = paste("Aspiration vs. Performance in", yr),
       subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
       y = "Application's aspiration (based on the percentile of their top1 school)",
       x = "Applicant's 8th-grade performance ranking")

# method 1 : improved version 1
aspiration_vs_perf %>%
  group_by(year, binned_studPerf = round(performance)/10) %>%
  summarise(mean_schoolpercentile_of_topchoice = mean(schoolpercentile_of_topchoice)) %>%
  filter(year == yr) %>%
  ggplot() +
  geom_point(aes(x = binned_studPerf, y = mean_schoolpercentile_of_topchoice), color = "dark blue") +
  labs(title = paste("Aspiration vs. Performance in", yr),
       subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
       y = "Application's aspiration (based on the percentile of their top1 school)",
       x = "Applicant's 8th-grade performance ranking")

# method 1 : improved version 2 (seems a bit more nuanced)
(plot5 <-
  aspiration_vs_perf %>%
  mutate(binned_studPerf = as.factor(round(performance/10))) %>%
  filter(year == yr) %>%
  ggplot() +
  geom_boxplot(aes(x = binned_studPerf, y = schoolpercentile_of_topchoice), alpha = 0.2, color = "dark blue") +
  labs(title = paste("Aspiration vs. Performance in", yr),
       subtitle = "Method 1: 'application score' = schoolpercentile_of_topchoice",
       y = "Application's aspiration (based on the percentile of their top1 school)",
       x = "Applicant's 8th-grade performance ranking"))

ggsave(filename = "ambitious_against_perf_2015_bytop1choice.pdf", plot = plot5, width = 8, height = 6)


# method 2: "application score" = median_schoolpercentile
aspiration_vs_perf %>%
  filter(year == yr) %>%
  ggplot() +
  geom_point(aes(x = performance, y = median_schoolpercentile), alpha = 0.2, color = "dark blue") +
  labs(title = paste("Aspiration vs. Performance in", yr),
       subtitle = "Method 2: 'application score' = median_schoolpercentile",
       y = "Application's aspiration (based on the median percentile of schools on list)",
       x = "Applicant's 8th-grade percentile (based on GPA)")

# method 2: improved version 1
aspiration_vs_perf %>%
  group_by(year, binned_studPerf = round(performance)/10) %>%
  summarise(mean_median_schoolpercentile = mean(median_schoolpercentile)) %>%
  filter(year == yr) %>%
  ggplot() +
  geom_point(aes(x = binned_studPerf, y = mean_median_schoolpercentile), color = "dark blue") +
  labs(title = paste("Aspiration vs. Performance in", yr),
       subtitle = "Method 2: 'application score' = median_schoolpercentile",
       y = "Application's aspiration (based on the median percentile of schools on list)",
       x = "Applicant's 8th-grade performance ranking")

# method 2: improved version 2 (seems a bit more nuanced)
(plot6 <-
  aspiration_vs_perf %>%
  mutate(binned_studPerf = as.factor(round(performance/10))) %>%
  filter(year == yr) %>%
  ggplot() +
  geom_boxplot(aes(x = binned_studPerf, y = median_schoolpercentile), alpha = 0.2, color = "dark blue") +
  labs(title = paste("Aspiration vs. Performance in", yr),
       subtitle = "Method 2: 'application score' = median_schoolpercentile",
       y = "Application's aspiration (based on the median percentile of schools on list)",
       x = "Applicant's 8th-grade performance ranking"))

ggsave(filename = "ambitious_against_perf_2015_bymedian.pdf", plot = plot6, width = 8, height = 6)